library(tidyverse)
library(knitr)
library(DESeq2)
library(DEGreport)
library(ggrepel)
ggplot2::theme_set(theme_light(base_size = 14))
opts_chunk[["set"]](
cache = FALSE,
dev = c("png", "pdf"),
error = TRUE,
highlight = TRUE,
message = FALSE,
prompt = FALSE,
tidy = FALSE,
warning = FALSE)
se <- readRDS("data/bcbio-se.rds")
metadata <- colData(se) %>%
as_tibble(rownames = NULL) %>%
dplyr::select(-batch, -phenotype)
metrics <- metadata(se)$metrics %>%
left_join(metadata, by = c("sample" = "sample"))
metrics$date_of <- as_factor(metrics$date_of)
metadata
metrics <- metrics %>%
group_by(date_of) %>%
mutate(position = rank(-total_reads))
metrics %>%
ggplot(aes(x = reorder(sample, -total_reads),
y = total_reads/1e6L,
fill = date_of,
group = position)) +
geom_bar(stat = "identity") +
coord_flip() +
scale_y_continuous(name = "mln reads") +
geom_text(aes(label = floor(total_reads/1e6L)), hjust = 0, nudge_y = 0.5)+
xlab("sample") +
ggtitle("Total reads")
The number of mapped reads should correspond to the number of total reads.
metrics %>%
ggplot(aes(x = reorder(sample, -mapped_reads),
y = mapped_reads, fill = date_of)) +
geom_bar(stat = "identity") +
coord_flip() +
scale_y_continuous(name = "mln reads") +
geom_text(aes(label = floor(mapped_reads/1e6L)), hjust = 0, nudge_y = 0.5)+
xlab("sample") +
ggtitle("Mapped reads")
## Mapping rate
#The genomic mapping rate represents the percentage of reads mapping to the reference genome. Low mapping rates are indicative of sample #contamination, poor sequencing quality or other artifacts.
#{r plot_mapping_rate, fig.width = 10, fig.height = 20}
#metrics %>%
# ggplot(aes(x = reorder(sample, -mapped_reads_pct),
# y = mapped_reads_pct, fill = tissue)) +
# geom_bar(stat = "identity") +
# coord_flip() +
# geom_text(aes(label = floor(mapped_reads_pct)), hjust = 0, nudge_y = 0.5)+
# xlab("sample") +
# ggtitle("Mapping rate")
genes_detected <- colSums(assays(se)[["raw"]] > 0) %>% enframe()
colnames(genes_detected) <- c("sample", "n_genes")
metrics <- metrics %>%
left_join(genes_detected, by = c("sample" = "sample"))
metrics %>%
ggplot(aes(x = reorder(sample, -n_genes),
y = n_genes, fill = date_of)) +
geom_bar(stat = "identity") +
coord_flip() +
geom_text(aes(label = n_genes), hjust = 0, nudge_y = 0.5)+
xlab("sample") +
ylim(0, 50000) +
ggtitle("Number of genes")
We should observe a linear trend in the number of genes detected with the number of mapped reads, which indicates that the sample input was not overloaded.
metrics %>%
ggplot(aes(x = log10(total_reads),
y = n_genes,
color = date_of)) +
geom_point(size = 5)+
geom_text_repel(aes(label = sample), size = 10)+
theme(axis.text = element_text(size = 20),
axis.title = element_text(size = 20, face="bold"),
legend.text = element_text(size = rel(1))) +
theme_bw(base_size = 20) +
ggtitle("Gene saturation")
Ideally, at least 60% of total reads should map to exons.
metrics %>%
ggplot(aes(x = reorder(sample, -exonic_rate),
y = exonic_rate * 100,
fill = date_of)) +
geom_bar(stat = "identity") +
geom_text(aes(label = floor(exonic_rate*100)), hjust = 0, nudge_y = 0.5)+
xlab("sample") +
ylab("Exonic rate %") +
ggtitle("Exonic mapping rate") +
coord_flip()
The majority of reads should map to exons and not introns.
metrics %>%
ggplot(aes(x = reorder(sample, -intronic_rate),
y = intronic_rate * 100,
fill = date_of)) +
geom_bar(stat = "identity") +
coord_flip() +
geom_text(aes(label = floor(intronic_rate*100)),
hjust = 0,
nudge_y = 0.5) +
xlab("sample") +
ylab("Intronic rate %")
ggtitle("Intronic mapping rate")
## $title
## [1] "Intronic mapping rate"
##
## attr(,"class")
## [1] "labels"
Samples should have a ribosomal RNA (rRNA) contamination rate below 10%.
metrics %>%
ggplot(aes(x = reorder(sample, -r_rna_rate),
y = r_rna_rate * 100,
fill = date_of)) +
geom_bar(stat = "identity") +
geom_text(aes(label = round(r_rna_rate*100,2)), hjust = 0, nudge_y = 0.5)+
xlab("sample") +
ylab("rRNA rate, %")+
ylim(0, 1) +
ggtitle("rRNA mapping rate") +
coord_flip()
metrics %>%
ggplot(aes(x = reorder(sample, -x5_3_bias),
y = x5_3_bias,
fill = date_of)) +
geom_bar(stat = "identity") +
geom_text(aes(label = x5_3_bias), hjust = 0, nudge_y = 0.01)+
xlab("sample") +
ylim(0, 1.5)+
ggtitle("5'-3' bias") +
coord_flip()
We expected similar spread for every sample.
metadata$date_of <- as_factor(metadata$date_of)
metrics_small <- metrics %>% dplyr::select(sample, date_of)
assays(se)[["raw"]] %>%
as_tibble() %>%
dplyr::filter(rowSums(.)!=0) %>%
gather(sample, counts) %>%
left_join(metadata, by = c("sample" = "sample")) %>%
ggplot(aes(sample, log2(counts+1), fill = date_of)) +
geom_boxplot() +
coord_flip() +
ggtitle("Counts per gene, all non-zero genes")
protein_coding_genes <- read_csv("tables/ensembl_w_description.protein_coding.csv")
assays(se)[["raw"]] %>%
as_tibble(rownames = "ensembl_gene_id") %>%
dplyr::filter(ensembl_gene_id %in% protein_coding_genes$ensembl_gene_id) %>%
dplyr::select(-ensembl_gene_id) %>%
dplyr::filter(rowSums(.)!=0) %>%
gather(sample, counts) %>%
left_join(metadata, by = c("sample" = "sample")) %>%
dplyr::mutate(date_of = as.factor(date_of)) %>%
ggplot(aes(sample, log2(counts+1), fill = date_of)) +
geom_boxplot() +
coord_flip() +
ggtitle("Counts per gene, protein_coding_genes")
raw_counts <- assays(se)[["raw"]] %>%
as_tibble() %>%
dplyr::filter(rowSums(.)!=0) %>%
as.matrix()
vst <- vst(raw_counts)
pca <- degPCA(vst, colData(se), condition = "date_of", name = "sample", data = T)[["plot"]]
pca_labels <- pca[["labels"]]
pca_data <- pca[["data"]] %>% as_tibble() %>%
dplyr::select(sample, PC1, PC2, date_of)
pca_data$date_of <- as.factor(pca_data$date_of)
pca_data %>%
ggplot(aes(x = PC1, y = PC2, color = date_of, label = sample)) +
geom_point(size = 5) +
geom_text_repel(size = 10) +
xlab(pca_labels$x) +
ylab(pca_labels$y) +
ggtitle(paste0("PCA: ", nrow(vst), " genes")) +
theme_bw(base_size = 20) +
theme(axis.text = element_text(size = 20),
axis.title = element_text(size = 20, face="bold"),
legend.text = element_text(size = rel(1)))
raw_counts <- assays(se)[["raw"]] %>%
as_tibble(rownames = "ensembl_gene_id") %>%
dplyr::filter(ensembl_gene_id %in% protein_coding_genes$ensembl_gene_id) %>%
column_to_rownames(var = "ensembl_gene_id") %>%
dplyr::filter(rowSums(.)!=0) %>%
as.matrix()
vst <- vst(raw_counts)
pca <- degPCA(vst, colData(se), condition = "date_of", name = "sample", data = T)[["plot"]]
pca_labels <- pca[["labels"]]
pca_data <- pca[["data"]] %>% as_tibble() %>%
dplyr::select(sample, PC1, PC2, date_of)
pca_data$date_of <- as.factor(pca_data$date_of)
pca_data %>%
ggplot(aes(x = PC1, y = PC2, color = date_of, label = sample)) +
geom_point(size = 5) +
geom_text_repel(size = 10) +
xlab(pca_labels$x) +
ylab(pca_labels$y) +
ggtitle(paste0("PCA:", nrow(vst), " genes")) +
theme_bw(base_size = 20) +
theme(axis.text = element_text(size = 20),
axis.title = element_text(size = 20, face="bold"),
legend.text = element_text(size = rel(1)))
raw_counts <- assays(se)[["raw"]] %>%
as_tibble(rownames = "ensembl_gene_id") %>%
dplyr::filter(ensembl_gene_id %in% protein_coding_genes$ensembl_gene_id) %>%
column_to_rownames(var = "ensembl_gene_id") %>%
dplyr::filter(rowSums(.)!=0) %>%
as.matrix()
vst <- vst(raw_counts)
pca <- degPCA(vst, colData(se), condition = "er", name = "sample", data = T)[["plot"]]
pca_labels <- pca[["labels"]]
pca_data <- pca[["data"]] %>% as_tibble() %>%
dplyr::select(sample, PC1, PC2, er)
pca_data$er <- as.factor(pca_data$er)
pca_data %>%
ggplot(aes(x = PC1, y = PC2, color = er, label = sample)) +
geom_point(size = 5) +
geom_text_repel(size = 10) +
xlab(pca_labels$x) +
ylab(pca_labels$y) +
ggtitle(paste0("PCA:", nrow(vst), " genes")) +
theme_bw(base_size = 20) +
theme(axis.text = element_text(size = 20),
axis.title = element_text(size = 20, face="bold"),
legend.text = element_text(size = rel(1)))
se_pre <- se[,se$treatment == "pre"]
raw_counts <- assays(se_pre)[["raw"]] %>%
as_tibble(rownames = "ensembl_gene_id") %>%
dplyr::filter(ensembl_gene_id %in% protein_coding_genes$ensembl_gene_id) %>%
column_to_rownames(var = "ensembl_gene_id") %>%
dplyr::filter(rowSums(.)!=0) %>%
as.matrix()
vst <- vst(raw_counts)
pca <- degPCA(vst, colData(se_pre), condition = "response", name = "sample", data = T)[["plot"]]
pca_labels <- pca[["labels"]]
pca_data <- pca[["data"]] %>% as_tibble() %>%
dplyr::select(sample, PC1, PC2, response)
pca_data$response <- as.factor(pca_data$response)
pca_data %>%
ggplot(aes(x = PC1, y = PC2, color = response, label = sample)) +
geom_point(size = 5) +
geom_text_repel(size = 10) +
xlab(pca_labels$x) +
ylab(pca_labels$y) +
ggtitle(paste0("PCA:", nrow(vst), " genes")) +
theme_bw(base_size = 20) +
theme(axis.text = element_text(size = 20),
axis.title = element_text(size = 20, face="bold"),
legend.text = element_text(size = rel(1)))
se_pre <- se[,se$treatment == "post"]
raw_counts <- assays(se_pre)[["raw"]] %>%
as_tibble(rownames = "ensembl_gene_id") %>%
dplyr::filter(ensembl_gene_id %in% protein_coding_genes$ensembl_gene_id) %>%
column_to_rownames(var = "ensembl_gene_id") %>%
dplyr::filter(rowSums(.)!=0) %>%
as.matrix()
vst <- vst(raw_counts)
pca <- degPCA(vst, colData(se_pre), condition = "response", name = "sample", data = T)[["plot"]]
pca_labels <- pca[["labels"]]
pca_data <- pca[["data"]] %>% as_tibble() %>%
dplyr::select(sample, PC1, PC2, response)
pca_data$response <- as.factor(pca_data$response)
pca_data %>%
ggplot(aes(x = PC1, y = PC2, color = response, label = sample)) +
geom_point(size = 5) +
geom_text_repel(size = 15) +
xlab(pca_labels$x) +
ylab(pca_labels$y) +
ggtitle(paste0("PCA:", nrow(vst), " genes")) +
theme_bw(base_size = 20) +
theme(axis.text = element_text(size = 20),
axis.title = element_text(size = 20, face="bold"),
legend.text = element_text(size = rel(1)))
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: Fedora 32 (Workstation Edition)
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/libopenblas-r0.3.12.so
##
## locale:
## [1] LC_CTYPE=en_CA.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_CA.UTF-8 LC_COLLATE=en_CA.UTF-8
## [5] LC_MONETARY=en_CA.UTF-8 LC_MESSAGES=en_CA.UTF-8
## [7] LC_PAPER=en_CA.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_CA.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] ggrepel_0.9.1 DEGreport_1.26.0
## [3] DESeq2_1.30.1 SummarizedExperiment_1.20.0
## [5] Biobase_2.50.0 MatrixGenerics_1.2.1
## [7] matrixStats_0.58.0 GenomicRanges_1.42.0
## [9] GenomeInfoDb_1.26.2 IRanges_2.24.1
## [11] S4Vectors_0.28.1 BiocGenerics_0.36.0
## [13] knitr_1.30 forcats_0.5.1
## [15] stringr_1.4.0 dplyr_1.0.5
## [17] purrr_0.3.4 readr_1.4.0
## [19] tidyr_1.1.3 tibble_3.1.0
## [21] ggplot2_3.3.3 tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] colorspace_2.0-0 rjson_0.2.20
## [3] ellipsis_0.3.1 circlize_0.4.12
## [5] XVector_0.30.0 ggdendro_0.1.22
## [7] GlobalOptions_0.1.2 fs_1.5.0
## [9] clue_0.3-58 rstudioapi_0.13
## [11] farver_2.1.0 bit64_4.0.5
## [13] AnnotationDbi_1.52.0 fansi_0.4.2
## [15] lubridate_1.7.10 xml2_1.3.2
## [17] splines_4.0.3 logging_0.10-108
## [19] mnormt_2.0.2 cachem_1.0.4
## [21] geneplotter_1.68.0 jsonlite_1.7.1
## [23] Nozzle.R1_1.1-1 Cairo_1.5-12.2
## [25] broom_0.7.5 annotate_1.68.0
## [27] cluster_2.1.0 dbplyr_2.1.0
## [29] png_0.1-7 compiler_4.0.3
## [31] httr_1.4.2 backports_1.2.1
## [33] assertthat_0.2.1 Matrix_1.2-18
## [35] fastmap_1.1.0 limma_3.46.0
## [37] cli_2.3.1 lasso2_1.2-21.1
## [39] htmltools_0.5.1.1 tools_4.0.3
## [41] gtable_0.3.0 glue_1.4.2
## [43] GenomeInfoDbData_1.2.4 Rcpp_1.0.6
## [45] cellranger_1.1.0 vctrs_0.3.6
## [47] nlme_3.1-149 psych_2.0.12
## [49] xfun_0.19 rvest_1.0.0
## [51] lifecycle_1.0.0 XML_3.99-0.5
## [53] edgeR_3.32.1 MASS_7.3-53
## [55] zlibbioc_1.36.0 scales_1.1.1
## [57] hms_1.0.0 RColorBrewer_1.1-2
## [59] ComplexHeatmap_2.6.2 yaml_2.2.1
## [61] memoise_2.0.0 reshape_0.8.8
## [63] stringi_1.5.3 RSQLite_2.2.3
## [65] genefilter_1.72.1 BiocParallel_1.24.1
## [67] shape_1.4.5 rlang_0.4.10
## [69] pkgconfig_2.0.3 bitops_1.0-6
## [71] evaluate_0.14 lattice_0.20-41
## [73] labeling_0.4.2 cowplot_1.1.1
## [75] bit_4.0.4 tidyselect_1.1.0
## [77] plyr_1.8.6 magrittr_2.0.1
## [79] R6_2.5.0 generics_0.1.0
## [81] DelayedArray_0.16.2 DBI_1.1.1
## [83] pillar_1.5.1 haven_2.3.1
## [85] withr_2.4.1 survival_3.2-7
## [87] RCurl_1.98-1.2 modelr_0.1.8
## [89] crayon_1.4.1 utf8_1.1.4
## [91] tmvnsim_1.0-2 rmarkdown_2.5
## [93] GetoptLong_1.0.5 locfit_1.5-9.4
## [95] grid_4.0.3 readxl_1.3.1
## [97] blob_1.2.1 ConsensusClusterPlus_1.54.0
## [99] reprex_1.0.0 digest_0.6.27
## [101] xtable_1.8-4 munsell_0.5.0